library(dplyr)
# For Model fitting
library(lme4)
library(purrr)
# For diagnostics
library(performance)
# For adding new columns
library(tibble)
library(caret)
# Load data
sys.source("./scripts/code_join_data_full_dataset.R", envir = knitr::knit_global())
# Load functions
## Models
sys.source("./R/functions_models.R", envir = knitr::knit_global())

Select performance and traits variables

Transform variables

Backtranformation info

Back transformation

Even though you’ve done a statistical test on a transformed variable, such as the log of fish abundance, it is not a good idea to report your means, standard errors, etc. in transformed units. A graph that showed that the mean of the log of fish per 75 meters of stream was 1.044 would not be very informative for someone who can’t do fractional exponents in their head. Instead, you should back-transform your results. This involves doing the opposite of the mathematical function you used in the data transformation. For the log transformation, you would back-transform by raising 10 to the power of your number. For example, the log transformed data above has a mean of 1.044 and a 95% confidence interval of ±0.344 log-transformed fish. The back-transformed mean would be 101.044=11.1 fish. The upper confidence limit would be 10(1.044+0.344)=24.4 fish, and the lower confidence limit would be 10(1.044-0.344)=5.0 fish. Note that the confidence interval is not symmetrical; the upper limit is 13.3 fish above the mean, while the lower limit is 6.1 fish below the mean. Also note that you can’t just back-transform the confidence interval and add or subtract that from the back-transformed mean; you can’t take 100.344 and add or subtract that.

Square-root transformation. This consists of taking the square root of each observation. The back transformation is to square the number. If you have negative numbers, you can’t take the square root; you should add a constant to each number to make them all positive.

People often use the square-root transformation when the variable is a count of something, such as bacterial colonies per petri dish, blood cells going through a capillary per minute, mutations per generation, etc.

# Box-Cox transformation was previously run 

data_for_models_transformed <- 
    data_for_models %>% 
        
        mutate(
            # Plant's performance
            #total_biomass_sqrt = sqrt(total_biomass),
            #above_biomass_sqrt = sqrt(above_biomass),
            #below_biomass_log = log(below_biomass),
            #agr_log = log(agr),
            
            # NO TRANSFORMATION variable already in log-log
            #rgr = rgr,
            
            # NO TRANSFORMATION variable already in log-log
            #rgr_slope = rgr_slope,
          
            # Traits
            amax_log = log(amax),
            gs_log = log(gs),
            wue_log = log(wue),
            
            # d13 and d15 where not transformed because the data has negative values
            pnue_log = log(data_for_models$pnue),
          
            # Covariate
            init_height = log(init_height)) %>%   
            
        # Remove original variables (non-transformed)
        dplyr::select(spcode, treatment, nfixer, init_height, everything(),
               -c(8,10,11:13,16))    

Models: Questions 1 and 2

\[response\sim treatment*fixer\ + initial\ height + random( 1|\ specie)\]

# Take response variables' names 
response_vars_q1_q2 <- 
    set_names(names(data_for_models_transformed)[5:(ncol(data_for_models_transformed))])
#data_for_models_transformed[data_for_models_transformed$id == 48,]
models_list_q1_q2 <- map(response_vars_q1_q2, ~ mixed_model_1(response = .x, 
                                                              data = data_for_models_transformed))

Models Nodule count

# Load data
# This step was done like this because I am working with a subset of the data
# source cleaned data
source("./scripts/code_clean_data_nodules.R")

# Delete unused variables
data_nodules_cleaned <-
    data_nodules_cleaned %>%
        
        # add id to rownames for keep track of the rows
        column_to_rownames("id") %>% 
        dplyr::select(spcode,treatment, everything())

m4 lmer gaussian

lmer_gaussian <- lmer(number_of_root_nodulation ~ treatment + init_height + 
                           (1 |spcode),
                       data = data_nodules_cleaned)
lmer_gaussian_log <-  lmer(log(number_of_root_nodulation) ~ treatment + init_height + 
                           (1 |spcode),
                       data = data_nodules_cleaned)

m5 glmer poisson

glmer_poisson <-  glmer(number_of_root_nodulation ~ treatment + init_height + 
                           (1 |spcode), family = "poisson",
                       data = data_nodules_cleaned)
models_list_nodule_count <- list(lmer_gaussian, lmer_gaussian_log, glmer_poisson)
names(models_list_nodule_count) <- c("lmer_gaussian",
                                     "lmer_gaussian_log",
                                     "glmer_poisson")

Mycorrhizal colonization

I decided not to include it, because I want to focus on Nfixing vrs non-Fixing, 
also I don't trust on the data

Models: Question 3

\[performance\sim treatment*\ fixer*\ scale(log(trait)\ + initial\ height + random( 1|\ specie)\]

Scale preditors

data_for_models_transformed_scaled <-         
    data_for_models_transformed %>% 
    
    # test for being sure that I select the traits
    #select(c(4,7,8,13:16))

        # scale only the predictors traits
         mutate(across(c(4,9:14), scale))
# Select traits (x_vars)
traits_names <- 
    set_names(names(data_for_models_transformed_scaled)
                          [c(9:14)])
traits_names
      d13c       d15n   amax_log     gs_log    wue_log   pnue_log 
    "d13c"     "d15n" "amax_log"   "gs_log"  "wue_log" "pnue_log" 
# Select plants performance vars (y_vars)
performance_names <- 
    set_names(names(data_for_models_transformed_scaled)[c(5:8)])

performance_names
  total_biomass   above_biomass   below_biomass             rgr 
"total_biomass" "above_biomass" "below_biomass"           "rgr" 
models_lmer_formulas <- model_combinations_formulas(performance_names, traits_names)

length(models_lmer_formulas)
[1] 24
models_lmer_formulas[1]
$`abovebiomass~amaxlog`
above_biomass ~ treatment:nfixer:amax_log + init_height + (1 | 
    spcode)
<environment: 0x55cc8e5900e8>
models_list_q3 <- map(models_lmer_formulas, 
                       ~ lmer(.x, data = data_for_models_transformed_scaled))

Validation plots

Collinearity

map(models_list_q1_q2, check_collinearity)
$total_biomass
# Check for Multicollinearity

Low Correlation

             Term  VIF Increased SE Tolerance
        treatment 3.94         1.99      0.25
           nfixer 1.13         1.06      0.88
      init_height 1.08         1.04      0.92
 treatment:nfixer 4.12         2.03      0.24

$above_biomass
# Check for Multicollinearity

Low Correlation

             Term  VIF Increased SE Tolerance
        treatment 3.96         1.99      0.25
           nfixer 1.07         1.04      0.93
      init_height 1.09         1.04      0.92
 treatment:nfixer 3.98         2.00      0.25

$below_biomass
# Check for Multicollinearity

Low Correlation

             Term  VIF Increased SE Tolerance
        treatment 3.93         1.98      0.25
           nfixer 1.17         1.08      0.85
      init_height 1.08         1.04      0.92
 treatment:nfixer 4.21         2.05      0.24

$rgr
# Check for Multicollinearity

Low Correlation

             Term  VIF Increased SE Tolerance
        treatment 3.97         1.99      0.25
           nfixer 1.04         1.02      0.96
      init_height 1.09         1.04      0.92
 treatment:nfixer 3.92         1.98      0.26

$d13c
# Check for Multicollinearity

Low Correlation

             Term  VIF Increased SE Tolerance
        treatment 3.92         1.98      0.25
           nfixer 1.21         1.10      0.83
      init_height 1.08         1.04      0.93
 treatment:nfixer 4.29         2.07      0.23

$d15n
# Check for Multicollinearity

Low Correlation

        Term  VIF Increased SE Tolerance
   treatment 3.87         1.97      0.26
      nfixer 1.73         1.32      0.58
 init_height 1.07         1.03      0.94

Moderate Correlation

             Term  VIF Increased SE Tolerance
 treatment:nfixer 5.53         2.35      0.18

$amax_log
# Check for Multicollinearity

Low Correlation

             Term  VIF Increased SE Tolerance
        treatment 3.96         1.99      0.25
           nfixer 1.07         1.04      0.93
      init_height 1.09         1.04      0.92
 treatment:nfixer 3.98         2.00      0.25

$gs_log
# Check for Multicollinearity

Low Correlation

        Term  VIF Increased SE Tolerance
   treatment 3.85         1.96      0.26
      nfixer 2.01         1.42      0.50
 init_height 1.06         1.03      0.94

Moderate Correlation

             Term  VIF Increased SE Tolerance
 treatment:nfixer 6.21         2.49      0.16

$wue_log
# Check for Multicollinearity

Low Correlation

             Term  VIF Increased SE Tolerance
        treatment 3.90         1.97      0.26
           nfixer 1.36         1.17      0.74
      init_height 1.07         1.04      0.93
 treatment:nfixer 4.64         2.15      0.22

$pnue_log
# Check for Multicollinearity

Low Correlation

             Term  VIF Increased SE Tolerance
        treatment 3.90         1.98      0.26
           nfixer 1.33         1.15      0.75
      init_height 1.08         1.04      0.93
 treatment:nfixer 4.58         2.14      0.22
map(models_list_nodule_count, check_collinearity)
$lmer_gaussian
# Check for Multicollinearity

Low Correlation

        Term  VIF Increased SE Tolerance
   treatment 1.06         1.03      0.94
 init_height 1.06         1.03      0.94

$lmer_gaussian_log
# Check for Multicollinearity

Low Correlation

        Term  VIF Increased SE Tolerance
   treatment 1.06         1.03      0.95
 init_height 1.06         1.03      0.95

$glmer_poisson
# Check for Multicollinearity

Low Correlation

        Term  VIF Increased SE Tolerance
   treatment 1.08         1.04      0.93
 init_height 1.08         1.04      0.93
#Warning: Model has interaction terms. VIFs might be inflated. You may check 
#multicollinearity among predictors of a model without interaction terms.

#map(models_list_q3, check_mul)

Bolker’s plots

bolker_validation <- function(model) {
    
    
    a <- plot(model, type = c("p", "smooth"))
    
    ## heteroscedasticity
    b <-     plot(model,sqrt(abs(resid(.))) ~ fitted(.), type = c("p", "smooth"))
   
    cowplot::plot_grid(a,b,nrow = 1)
}

Models for questions 1,2

map(models_list_q1_q2, bolker_validation)
$total_biomass


$above_biomass


$below_biomass


$rgr


$d13c


$d15n


$amax_log


$gs_log


$wue_log


$pnue_log

Models for nodule count

map(models_list_nodule_count, bolker_validation)
$lmer_gaussian


$lmer_gaussian_log


$glmer_poisson

Models for question 3

map(models_list_q3, bolker_validation)
$`abovebiomass~amaxlog`


$`abovebiomass~d13c`


$`abovebiomass~d15n`


$`abovebiomass~gslog`


$`abovebiomass~pnuelog`


$`abovebiomass~wuelog`


$`belowbiomass~amaxlog`


$`belowbiomass~d13c`


$`belowbiomass~d15n`


$`belowbiomass~gslog`


$`belowbiomass~pnuelog`


$`belowbiomass~wuelog`


$`rgr~amaxlog`


$`rgr~d13c`


$`rgr~d15n`


$`rgr~gslog`


$`rgr~pnuelog`


$`rgr~wuelog`


$`totalbiomass~amaxlog`


$`totalbiomass~d13c`


$`totalbiomass~d15n`


$`totalbiomass~gslog`


$`totalbiomass~pnuelog`


$`totalbiomass~wuelog`

Performance package

Models for questions 1,2

map(models_list_q1_q2, check_model)
$total_biomass


$above_biomass


$below_biomass


$rgr


$d13c


$d15n


$amax_log


$gs_log


$wue_log


$pnue_log

Models for nodule count

map(models_list_nodule_count, check_model)
$lmer_gaussian


$lmer_gaussian_log


$glmer_poisson

Models for question 3

map(models_list_q3, check_model)
$`abovebiomass~amaxlog`


$`abovebiomass~d13c`


$`abovebiomass~d15n`


$`abovebiomass~gslog`


$`abovebiomass~pnuelog`


$`abovebiomass~wuelog`


$`belowbiomass~amaxlog`


$`belowbiomass~d13c`


$`belowbiomass~d15n`


$`belowbiomass~gslog`


$`belowbiomass~pnuelog`


$`belowbiomass~wuelog`


$`rgr~amaxlog`


$`rgr~d13c`


$`rgr~d15n`


$`rgr~gslog`


$`rgr~pnuelog`


$`rgr~wuelog`


$`totalbiomass~amaxlog`


$`totalbiomass~d13c`


$`totalbiomass~d15n`


$`totalbiomass~gslog`


$`totalbiomass~pnuelog`


$`totalbiomass~wuelog`

Save lists with the models

saveRDS(models_list_q1_q2, file = "./processed_data/models_q1_q2.RData") 
saveRDS(models_list_q3, file = "./processed_data/models_q3_3_way_interaction.RData") 
saveRDS(models_list_nodule_count, file = "./processed_data/models_list_nodule_count.RData")